Libraries

library(odbc)
library(DBI)
library(tidyverse)
library(plotly)
library(DT)
library(utf8)
library(mapdata)



Connect to the database

I’ll use “PostgreSQL Unicode(x64)” as my driver and I’ll load the username and password from environment variables.

con <- dbConnect(odbc(),
                 Driver   = "PostgreSQL Unicode(x64)",
                 Server   = "localhost",
                 Database = "mereghetti",
                 UID      = Sys.getenv('postgres_username'),
                 PWD      = Sys.getenv('postgres_psw'),
                 Port     = 5432)



Query and visualize data


1. Countries by number of films in the database

What countries have produced the most films in the database?

Here we need to take two important steps before calculating the total number of films for each country. First, we need to split and unnest the country column as it contains multiple values for films that were co-produced by multiple countries. Second, we are going to recode both the “Russian Federation” and “USSR” as Russia.


WITH CTE_ONE AS (
                 SELECT UNNEST(STRING_TO_ARRAY(country, ';')) unnested_country
                 FROM info
                        ),

CTE_TWO AS (
                SELECT CASE WHEN unnested_country IN ('Russian Federation', 'USSR') THEN 'Russia' 
                                ELSE unnested_country END AS country_recoded
                FROM CTE_ONE
                 )
            
SELECT country_recoded "Country",
       COUNT(*) :: INTEGER "Number of films"
FROM CTE_TWO
GROUP BY 1
ORDER BY 2 DESC 
datatable(all_countries, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
world <- map_data("world")

world <- world %>% 
         rename(Country = region) %>% 
         left_join(all_countries, by = c("Country" = "Country"))
                   

p <- world %>% 
  ggplot(aes(x = long, y = lat, group = group, label = Country , fill = `Number of films`)) +
  geom_polygon(color = "black", size = .1) +
  coord_fixed(1.3) +
  theme_classic() +
  scale_fill_gradient(low = "light blue", high = "dark blue", na.value = "light grey", guide = "none") +
  labs(title = "Countries by n.of films in the database") +
  theme(line = element_blank(),
        plot.title = element_text(size = 18),
        axis.title = element_blank(),
        axis.text = element_blank()) 

ggplotly(p)

About 82% of the films in the database had either an American, French or Italian production.

This result can be chalked up to: a) Hollywood’s dominance of film industry; b) Mereghetti’s strong interest in French culture; c) Mereghetti’s Italian nationality.



2. Average runtime over time


Have films got longer over time?


SELECT year, 
       ROUND(AVG(runtime) :: NUMERIC, 2) average_runtime 
FROM facts 
GROUP BY 1 
ORDER BY 1
datatable(avg_runtime, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
p <- avg_runtime %>% 
  ggplot(aes(x = year, y = average_runtime)) +
  geom_line(color = "sky blue", size = .75) +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(n.breaks = 10) +
  theme_classic() +
  labs(title = "Average film runtime by year",
       y = "minutes") +
  theme(plot.title = element_text(size = 18),
        plot.subtitle = element_text(size = 10),
        axis.title.x = element_blank(),
        axis.text.y = element_text(size = 10),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_text(size = 10), 
        line = element_blank())

ggplotly(p)


There has been a sudden spike in the average runtime these last three years.

3. Frequency distribution of Mereghetti’s ratings


How difficult is it for a film to receive the highest rating from Paolo Mereghetti?


SELECT rating :: INTEGER "Rating",
       COUNT(*) :: INTEGER "Number of films",
       ROUND(COUNT(*) :: NUMERIC / (SELECT COUNT(*) FROM facts), 3) "Ratio"
FROM facts
GROUP BY 1
p <- all_ratings %>% 
  mutate(Ratio = scales::percent(Ratio)) %>% 
  ggplot(aes(x = Rating, y = `Number of films`)) +
  geom_col(fill = "skyblue") +
  geom_text(aes(label = Ratio), nudge_y = 230, size = 3.5) +
  theme_classic() +
  scale_x_continuous(breaks = c(1, 8)) +
  scale_y_continuous(breaks = c(0, 3500, 7000)) +
  labs(title = "Distribution of Mereghetti's ratings",
       x = "Rating") +
  theme(plot.title = element_text(size = 18),
        line = element_blank(),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        axis.title.y = element_blank())

ggplotly(p)


About 60% of films fall into the lower half of Mereghetti’s rating spectrum and only 1.8% receive his highest score.



4. Frequency of Mereghetti’s four-star rated films over time


How has the number of four-star-rated films changed over time?

SELECT year "Year",
       COUNT(*) :: INTEGER "Number of films",
       STRING_AGG(title || ' (' || year || ')', ' | ') Titles
FROM facts
JOIN info 
USING(id)
WHERE rating = 8
GROUP BY 1
ORDER BY 1
four_star_by_year <- four_star_by_year %>% mutate_if(is.character, utf8_encode)

datatable(four_star_by_year, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))


p <- four_star_by_year %>% 
  ggplot(aes(x = Year, y = `Number of films`)) +
  geom_line(size = .75, colour = "sky blue") +
  theme_classic() +
  scale_x_continuous(n.breaks = 7) +
  scale_y_continuous(n.breaks = 8) +
  labs(title = "Frequency of four-star-rated films over time",
       subtitle = "The 1950s were the heyday of cinema according to Mereghetti") +
  theme(plot.title = element_text(size = 18),
        plot.subtitle = element_text(size = 12),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_text(size = 10), 
        line = element_blank())

ggplotly(p)


It looks like according to Mereghetti cinema peaked around the ’50s and kept going downhill from there.



5. Frequency of Mereghetti’s four-star rated films by decade


Let’s investigate this further by counting the number of four-star-rated films after grouping by decade.


SELECT CASE WHEN year >= 2020 THEN '2020s'
            WHEN year >= 2010 AND year < 2020 THEN '2010s'
            WHEN year >= 2000 AND year < 2010 THEN '2000s'
            WHEN year >= 1990 AND year < 2000 THEN '1990s'
            WHEN year >= 1980 AND year < 1990 THEN '1980s'
            WHEN year >= 1970 AND year < 1980 THEN '1970s'
            WHEN year >= 1960 AND year < 1970 THEN '1960s'
            WHEN year >= 1950 AND year < 1960 THEN '1950s'
            WHEN year >= 1940 AND year < 1950 THEN '1940s'
            WHEN year >= 1930 AND year < 1940 THEN '1930s'
            WHEN year >= 1920 AND year < 1930 THEN '1920s'
            WHEN year >= 1910 AND year < 1920 THEN '1910s'
            WHEN year >= 1900 AND year < 1910 THEN '1900s'
            WHEN year >= 1890 and year < 1900 THEN '1890s'
            ELSE 'Other' END AS "Decade",
        COUNT(*) :: INTEGER "Number of films",
        ROUND(COUNT(*) :: NUMERIC / (SELECT COUNT(*) FROM facts WHERE rating = 8), 4) "Ratio",
        STRING_AGG(title || ' (' || year || ')', ' | ') "Titles"
FROM facts
JOIN info 
USING(id)
WHERE rating = 8
GROUP BY 1
ORDER BY 1
four_star_by_decade <- four_star_by_decade %>% mutate_if(is.character, utf8_encode)

datatable(four_star_by_decade, rownames = FALSE, filter="top", options = list(pageLength = 3, scrollX=T))


p <- four_star_by_decade %>% 
  mutate(label = scales::percent(Ratio)) %>% 
  ggplot(aes(x = Decade, y = `Number of films`)) +
  geom_col(fill = "sky blue") +
  geom_text(aes(label = label), nudge_y = 3.5, size = 3.5) +
  theme_classic() +
  labs(title = "Frequency of four-star rated films by decade",
       y = "Absolute frequency") +
  theme(plot.title = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.text = element_text(size = 10),
        axis.ticks = element_blank(), 
        line = element_blank())

ggplotly(p)


This bar chart confirms our insight: the 1950s-1960s period was the best one for movies in Mereghetti’s opinion.



6. Frequency of four-star rated films by country


What countries have the most 4/4 rated films in the database?

In this case we need to split and unnest the ‘country’ column as it often contains multiple values separated by a semi-colon (a film can be co-produced by more than one country). As a result the percentages add up to more than 100%. For visualization’s sake, we’ll limit the results to the top 20 countries by frequency.


SELECT UNNEST(STRING_TO_ARRAY(country, ';')) "Country",
       COUNT(*)  :: INTEGER "Number of films",
       ROUND(COUNT(*) :: NUMERIC / (SELECT COUNT(*) FROM facts WHERE rating = 8), 4) "Ratio"
FROM info
JOIN facts 
USING(id)
WHERE rating = 8
GROUP BY 1
ORDER BY 2 DESC
LIMIT 20
four_star_by_country <- four_star_by_country %>% mutate_if(is.character, utf8_encode)

datatable(four_star_by_country, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))


p <- four_star_by_country %>% 
  mutate(label = scales::percent(Ratio),
         Country = fct_reorder(Country, `Number of films`)) %>% 
  ggplot(aes(x = Country, y = `Number of films`)) +
  geom_col(aes(fill = Ratio)) +
  geom_text(aes(label = label), size = 3.25, nudge_y = 13) +
  coord_flip() +
  theme_classic() +
  scale_fill_gradient(low = "light blue", high = "dark blue", guide = 'none') +
  labs(title = "Frequency of four star rated films by country",
       y = "Absolute frequency") +
  theme(legend.position = "None", 
        plot.title = element_text(size = 18), 
        plot.subtitle = element_text(size = 12),
        axis.text = element_text(size = 10),
        axis.title.y = element_blank(),
        axis.ticks = element_blank(),
        line = element_blank()) 

ggplotly(p)


The US were involved in about 4 four-rated films out of 10



7. Frequency of four-star rated films by country over time

Let’s count the number of four-star rated films for each country-decade combinations (for example, French films in the ’60s, Hong-Kong films in the 90s etc). For visualization’s sake we’ll focus only on the top 10 countries by number of four-star rated films (calculated with a dense rank).

In the first common table expression we’ll calculate the frequency of 4/4 films for each country-decade combinations and in the second CTE we’ll identify the top 10 countries by n. of 4/4. Finally we’ll filter the first CTE by only keeping data about those top 10 countries.

WITH CTE_ONE AS (
                  SELECT CASE WHEN year >= 2020 THEN '2020s'
                              WHEN year >= 2010 AND year < 2020 THEN '2010s'
                              WHEN year >= 2000 AND year < 2010 THEN '2000s'
                              WHEN year >= 1990 AND year < 2000 THEN '1990s'
                              WHEN year >= 1980 AND year < 1990 THEN '1980s'
                              WHEN year >= 1970 AND year < 1980 THEN '1970s'
                              WHEN year >= 1960 AND year < 1970 THEN '1960s'
                              WHEN year >= 1950 AND year < 1960 THEN '1950s'
                              WHEN year >= 1940 AND year < 1950 THEN '1940s'
                              WHEN year >= 1930 AND year < 1940 THEN '1930s'
                              WHEN year >= 1920 AND year < 1930 THEN '1920s'
                              WHEN year >= 1910 AND year < 1920 THEN '1910s'
                              WHEN year >= 1900 AND year < 1910 THEN '1900s'
                              WHEN year >= 1890 and year < 1900 THEN '1890s'
                              ELSE 'Other' END AS "Decade",
                         UNNEST(STRING_TO_ARRAY(country, ';')) "Country",
                         COUNT(*) :: INTEGER "Number of films",
                         STRING_AGG(title || ' (' || year || ')', ' | ') "Titles"
                  FROM facts
                  JOIN info USING(id)
                  WHERE rating = 8
                  GROUP BY 1, 2
                  ),
        

     CTE_TWO AS (SELECT UNNEST(STRING_TO_ARRAY(country, ';')) "Country",
                        DENSE_RANK() OVER(ORDER BY COUNT(*) DESC) rank
                 FROM facts
                 JOIN info USING (id)
                 WHERE rating = 8 
                 GROUP BY 1
                )
              
SELECT *
FROM CTE_ONE
WHERE "Country" IN (SELECT "Country" FROM CTE_TWO WHERE rank <= 10)
ORDER BY "Number of films" DESC
              
decade_country_combinations <- decade_country_combinations %>% mutate_if(is.character, utf8_encode)

datatable(decade_country_combinations, rownames = FALSE, filter="top", options = list(pageLength = 2, scrollX=T))
p <- decade_country_combinations %>% 
  ggplot(aes(x = Decade, y = Country, fill = `Number of films`)) +
  geom_tile() +
  scale_fill_gradient(low = "light blue", high = "dark blue") +
  theme_classic() +
  labs(title = "Frequency of four-star rated films by country over time") +
  theme(axis.title = element_blank(),
        line = element_blank(),
        plot.title = element_text(size = 18),
        axis.text = element_text(size = 10))

ggplotly(p)


According to Mereghetti, the US reached their peak during the later phases of the Golden Age of Hollywood (1950s) while France did so during the Nouvelle Vague (1960s)



8. Frequency of four-star rated films by genre


What are the genres with the highest number of four-star-rated films?

As a film can belong to more than one genre, we’ll split and unnest the ‘genre’ column the same way we did for the ‘country’ column. Again, the percentages will add up to more than 100%.


SELECT UNNEST(STRING_TO_ARRAY(genre, ';')) "Genre",
       COUNT(*) :: INTEGER "Number of films",
       ROUND(COUNT(*) :: NUMERIC / (SELECT COUNT(*) FROM facts WHERE rating = 8), 4) "Ratio" 
FROM info
JOIN facts
USING (id)
WHERE rating = 8
GROUP BY 1
ORDER BY 2 DESC
datatable(four_rated_by_genre, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
p <- four_rated_by_genre %>% 
  mutate(label = scales::percent(Ratio),
         Genre = fct_reorder(Genre, `Number of films`)) %>% 
  ggplot(aes(x = Genre, y = `Number of films`, fill = Ratio)) +
  geom_col() +
  geom_text(aes(label = label), size = 3.25, nudge_x = -.1, nudge_y = 23) +
  coord_flip() +
  theme_classic() +
  labs(title = "Frequency of four-star rated films by genre",
       y = "Absolute frequency") +
  theme(plot.title = element_text(size = 18),
        axis.title.y = element_blank(),
        axis.text = element_text(size = 10),
        axis.ticks = element_blank(),
        legend.position = "None", 
        line = element_blank()) +
  scale_fill_gradient(low = "light blue", high = "dark blue", guide = "none")

ggplotly(p)


7 four-star-rated movies out of 10 are dramas



9. Frequency of four-star rated films by genre over time


SELECT CASE WHEN year >= 2020 THEN '2020s'
            WHEN year >= 2010 AND year < 2020 THEN '2010s'
            WHEN year >= 2000 AND year < 2010 THEN '2000s'
            WHEN year >= 1990 AND year < 2000 THEN '1990s'
            WHEN year >= 1980 AND year < 1990 THEN '1980s'
            WHEN year >= 1970 AND year < 1980 THEN '1970s'
            WHEN year >= 1960 AND year < 1970 THEN '1960s'
            WHEN year >= 1950 AND year < 1960 THEN '1950s'
            WHEN year >= 1940 AND year < 1950 THEN '1940s'
            WHEN year >= 1930 AND year < 1940 THEN '1930s'
            WHEN year >= 1920 AND year < 1930 THEN '1920s'
            WHEN year >= 1910 AND year < 1920 THEN '1910s'
            WHEN year >= 1900 AND year < 1910 THEN '1900s'
            WHEN year >= 1890 and year < 1900 THEN '1890s'
            ELSE 'Other' END AS "Decade",
       UNNEST(STRING_TO_ARRAY(genre, ';')) "Genre",
       COUNT(*) :: INTEGER "Number of films"
FROM facts
JOIN info USING (id)
WHERE rating = 8
GROUP BY 1, 2
ORDER BY 3 DESC
datatable(decade_genre, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
p <- decade_genre %>% 
  ggplot(aes(x = Decade, y = Genre, fill = `Number of films`)) +
  geom_tile() +
  scale_fill_gradient(low = "light blue", high = "dark blue") +
  theme_classic() +
  labs(title = "N. of four-star rated films by genre over time") +
  theme(axis.title = element_blank(),
        line = element_blank(),
        plot.title = element_text(size = 18),
        axis.text = element_text(size = 10))

ggplotly(p)



10. Directors with most four-star-rated films


Who are the directors with the highest number of 4/4 rated films?

Once again splitting and unnesting will come to our rescue.


WITH CTE AS (
              SELECT UNNEST(STRING_TO_ARRAY(director, ';')) "Director",
                     DENSE_RANK() OVER(ORDER BY COUNT(*) DESC) "Rank",
                     COUNT(*) :: INTEGER "Number of films",
                     STRING_AGG(title || ' (' || year || ')', ' | ') "Titles"
              FROM facts
              JOIN info USING(id)
              JOIN people USING(id)
              WHERE rating = 8
              GROUP BY 1
) 

SELECT *
FROM CTE
WHERE "Rank" < 6
four_rated_by_director <- four_rated_by_director %>% mutate_if(is.character, utf8_encode)

datatable(four_rated_by_director, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
p <- four_rated_by_director %>% 
  mutate(Director = fct_reorder(Director, `Number of films`)) %>% 
  ggplot(aes(x = `Number of films`, y = Director, fill = `Number of films`)) +
  geom_col() +
  theme_classic() +
  scale_fill_gradient(low = "light blue", high = "dark blue", guide = "none") +
  scale_x_continuous(breaks = c(0, 5, 10)) +
  labs(title = "Directors by n. of four-star-rated films") +
  theme(plot.title = element_text(size = 18),
        line = element_blank(),
        axis.title = element_blank(),
        axis.text.y = element_text(size = 10),
        axis.text.x = element_text(size = 10))

ggplotly(p)


All directors except Allen, Lynch, Scorsese and Eastwood have passed away



11. Actors with most film credits in four-star-rated-films



WITH CTE AS ( SELECT UNNEST(STRING_TO_ARRAY(actors, ';')) "Actor",
                     DENSE_RANK() OVER(ORDER BY COUNT(*) DESC) "Rank",
                     COUNT(*) :: INTEGER "Number of films",
                     STRING_AGG(title || ' (' || year || ')', ' | ') "Titles"
              FROM facts
              JOIN info USING(id)
              JOIN people USING(id)
              WHERE rating = 8
              GROUP BY 1
              )
              
SELECT * 
FROM CTE
WHERE "Rank" < 7
top_actors <- top_actors

datatable(top_actors, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
p <- top_actors %>% 
  mutate(Actor = fct_reorder(Actor, `Number of films`)) %>% 
  ggplot(aes(x = `Number of films`, 
             y = Actor)) +
  geom_col(aes(fill = `Number of films`)) +
  theme_classic() +
  scale_x_continuous(breaks = c(0, 8, 15)) +
  scale_fill_gradient(low = "light blue", high = "dark blue", guide = "none") +
  labs(title = "Actors with most film credits in four-star-rated films") +
  theme(plot.title = element_text(size = 18),
        legend.position = "None",
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_text(size = 10),
        axis.text.x = element_text(size = 10),
        line = element_blank())

ggplotly(p)


If some of these names sound obscure it’s because they mostly worked as extras. Bess Flowers, for example, was known as ‘The Queen of Hollywood extras’



12. Composers with most film credits in four-star-rated films


Is Ennio Morricone going to top the list of composers with most credits in 4/4 films?


WITH CTE AS ( SELECT UNNEST(STRING_TO_ARRAY(composer, ';')) "Composer",
                     DENSE_RANK() OVER(ORDER BY COUNT(*) DESC) "Rank",
                     COUNT(*) :: INTEGER "Number of films",
                     STRING_AGG(title || ' (' || year || ')', ' | ') "Titles"
               FROM facts 
               JOIN info USING(id)
               JOIN people USING(id)
               WHERE rating = 8
               GROUP BY 1
               )

SELECT *
FROM CTE
WHERE "Rank" < 6
top_composers <- top_composers %>% mutate_if(is.character, utf8_encode)

datatable(top_composers, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
p <- top_composers %>% 
  mutate(Composer = fct_reorder(Composer, `Number of films`)) %>% 
  ggplot(aes(y = Composer,
             x = `Number of films`, 
             fill = `Number of films`)) +
  geom_col() +
  theme_classic() +
  scale_x_continuous(breaks = c(0, 4, 8)) +
  labs(title = "Composers with most film credits in four-star-rated films") +
  theme(plot.title = element_text(size = 18),
        plot.subtitle = element_text(size = 10),
        legend.position = "None",
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_text(size = 10),
        axis.text.x = element_text(size = 10),
        line = element_blank()) +
  scale_fill_gradient(low = "light blue", high = "dark blue", guide = "none")


ggplotly(p)

Nino Rota scored most of Federico Fellini’s films.

These aren’t necessarily Paolo Mereghetti’s favorite composers. They’re simply the composers who show up most frequenly among the films he awarded his highest score.



13. Artists with most film credits in four-star-rated films


By “artist” I mean anybody: directors, producers, writers, composers, editors etc.

To calculate this, we’ll unpivot all columns referring to people.


WITH CTE AS (
SELECT UNNEST(ARRAY[director, actors, producers, writer, editor, cinematographer, production_designer, 
                    set_decorator, composer, sound, costumes, make_up]) AS artists
FROM facts
JOIN people
USING(id)
WHERE rating = 8),

CTE_TWO AS (
            SELECT UNNEST(STRING_TO_ARRAY(artists, ';')) "Artist", 
                   DENSE_RANK() OVER(ORDER BY COUNT(*) DESC) "Rank",
                   COUNT(*) :: INTEGER "Number of films"
            FROM CTE
            GROUP BY 1
            )
            
SELECT *
FROM CTE_TWO
WHERE "Rank" < 11
top_artists <- top_artists %>% mutate_if(is.character, utf8_encode)

datatable(top_artists, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
p <- top_artists %>% 
  mutate(Artist = fct_reorder(Artist, `Number of films`)) %>% 
  mutate_if(is.character, utf8_encode) %>% 
  ggplot(aes(x = Artist, 
             y = `Number of films`, 
             fill = `Number of films`)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  labs(title = "People with most credits in four-star-rated films") +
  theme(legend.position = "None",
        plot.title = element_text(size = 18),
        plot.subtitle = element_text(size = 10),
        axis.title = element_blank(),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        axis.ticks = element_blank(),
        line = element_blank()) +
  scale_fill_gradient(low = "light blue", high = "dark blue", guide = "none") +
  scale_y_continuous(n.breaks = 3)

ggplotly(p)



Chaplin was jack of all trades: he produced, wrote, directed, edited, scored and starred in his films.



14. Average runtime by rating


Is there a relationship between runtime and rating? Perhaps film critics favor longer films.


SELECT rating :: INTEGER "Rating",
       ROUND(AVG(runtime) :: NUMERIC, 2) "Average runtime"
FROM facts
GROUP BY 1
p <- rating_vs_runtime %>% 
  ggplot(aes(x = Rating, y = `Average runtime`)) +
  geom_col(fill = "sky blue") +
  geom_text(aes(label = `Average runtime`), nudge_y = 2) +
  coord_cartesian(ylim = c(80, 120)) +
  scale_y_continuous(breaks = c(80, 120)) +
  scale_x_continuous(breaks = c(1, 8)) +
  theme_classic() +
  labs(title = "Average runtime by rating", 
       y = "Runtime",
       x = "Rating") +
  theme(plot.title = element_text(size = 18),
        axis.text = element_text(size = 11),
        line = element_blank()) 

ggplotly(p)


Mereghetti does seem to be slightly biased in favor of longer films.